{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入pandas、requests_html的HTMLSession模块\n",
    "\n",
    "import pandas as pd \n",
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch one search-results page; the cells below parse its HTML with XPath.\n",
    "\n",
    "url = \"https://www.liepin.com/zhaopin/?key=广告优化师\"  # search keyword: 广告优化师\n",
    "session = HTMLSession()\n",
    "# A timeout keeps a stalled connection from blocking the kernel forever.\n",
    "r = session.get(url, timeout=10)\n",
    "# Stop on an HTTP error instead of running the XPath queries on an error page.\n",
    "r.raise_for_status()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the per-job <li> nodes first; later XPath queries run inside each of them.\n",
    "\n",
    "主要元素 = r.html.xpath('//ul[@class=\"sojob-list\"]/li')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 作为xpath字典，键为我要抓的牛肉名称，值为xpath\n",
    "\n",
    "dict_xpaths={ \n",
    "    'text': {\n",
    "        'edu':      '//div[contains(@class,\"job-info\")]/p/span[@class=\"edu\"]',\n",
    "        '经验':      '//div[contains(@class,\"job-info\")]/p/span[@class=\"edu\"]/following-sibling::span',\n",
    "        '薪水':    '//div[contains(@class,\"job-info\")]/p/span[@class=\"text-warning\"]', \n",
    "        '时间':    '//div[contains(@class,\"job-info\")]/p/time/@title', \n",
    "        '职称':    '//div[contains(@class,\"job-info\")]/h3/a', \n",
    "        '公司地点': '//div[contains(@class,\"job-info\")]/p/a',\n",
    "        '公司名称': '//div[contains(@class,\"sojob-item-main\")]//p[@class=\"company-name\"]/a', \n",
    "    },\n",
    "    'text_content': {\n",
    "    },\n",
    "    'href': {\n",
    "        '链结':    '//div[contains(@class,\"job-info\")]/h3/a', \n",
    "        '公司URL': '//div[contains(@class,\"sojob-item-main\")]//p[@class=\"company-name\"]/a', \n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义三个函数\n",
    "\n",
    "def get_e_text_content(_xpath_):\n",
    "    \"\"\"Return the text content of the first XPath match inside each job item.\n",
    "\n",
    "    Args:\n",
    "        _xpath_: XPath expression evaluated on every element of the global\n",
    "            list ``主要元素``.\n",
    "\n",
    "    Returns:\n",
    "        list of str with one entry per element of ``主要元素``. An element with\n",
    "        no match yields an empty string, so the list keeps one value per item.\n",
    "    \"\"\"\n",
    "    results = []\n",
    "    for e in 主要元素:\n",
    "        first_match = e.xpath(_xpath_, first=True)  # None when nothing matches\n",
    "        # Indexing [0] on an empty match list used to raise IndexError here.\n",
    "        results.append(\"\" if first_match is None else first_match.lxml.text_content())\n",
    "    return results\n",
    "\n",
    "def get_e_text(_xpath_):\n",
    "    \"\"\"Return the concatenated, stripped text of all XPath matches per job item.\n",
    "\n",
    "    Args:\n",
    "        _xpath_: XPath expression evaluated on every element of the global\n",
    "            list ``主要元素``. It may select elements or string values such as\n",
    "            an attribute (``.../@title``).\n",
    "\n",
    "    Returns:\n",
    "        list of str with one entry per element of ``主要元素``; the stripped\n",
    "        pieces of all matches are joined without a separator, and an element\n",
    "        with no match yields an empty string.\n",
    "    \"\"\"\n",
    "    def _stripped(match):\n",
    "        # String results are used directly; element results expose .text.\n",
    "        return match.strip() if isinstance(match, str) else match.text.strip()\n",
    "\n",
    "    return [\"\".join(_stripped(x) for x in e.xpath(_xpath_)) for e in 主要元素]\n",
    "\n",
    "def get_e_href(_xpath_):\n",
    "    \"\"\"Return one absolute link from the first XPath match inside each job item.\n",
    "\n",
    "    Args:\n",
    "        _xpath_: XPath expression evaluated on every element of the global\n",
    "            list ``主要元素``; it is expected to select an element.\n",
    "\n",
    "    Returns:\n",
    "        list of str with one entry per element of ``主要元素``. The entry is an\n",
    "        empty string when nothing matches or the match carries no link.\n",
    "    \"\"\"\n",
    "    results = []\n",
    "    for e in 主要元素:\n",
    "        # Evaluate the XPath once; it used to run twice per item, and a missing\n",
    "        # match (None) crashed on the .absolute_links attribute access.\n",
    "        first_match = e.xpath(_xpath_, first=True)\n",
    "        links = first_match.absolute_links if first_match is not None else ()\n",
    "        results.append(next(iter(links), \"\"))\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>edu</th>\n",
       "      <th>经验</th>\n",
       "      <th>薪水</th>\n",
       "      <th>时间</th>\n",
       "      <th>职称</th>\n",
       "      <th>公司地点</th>\n",
       "      <th>公司名称</th>\n",
       "      <th>链结</th>\n",
       "      <th>公司URL</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>10-15k·12薪</td>\n",
       "      <td>2020年07月14日</td>\n",
       "      <td>广告优化师（头条，广点通）</td>\n",
       "      <td>上海</td>\n",
       "      <td>上海嵩恒网络科技股份有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929908883.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7877259/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>12-23k·12薪</td>\n",
       "      <td>2020年07月15日</td>\n",
       "      <td>海外广告优化师/广告优化师/APP推广</td>\n",
       "      <td>广州-天河区</td>\n",
       "      <td>广州安悦网络科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929355323.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9665352/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>11-12k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>教育行业广告优化师（腾讯）</td>\n",
       "      <td>北京-双榆树</td>\n",
       "      <td>人瑞集团</td>\n",
       "      <td>https://www.liepin.com/job/1930040621.shtml</td>\n",
       "      <td>https://www.liepin.com/company/3346592/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>11-12k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>广告数据优化师（腾讯）</td>\n",
       "      <td>深圳-南山区</td>\n",
       "      <td>人瑞集团</td>\n",
       "      <td>https://www.liepin.com/job/1930040533.shtml</td>\n",
       "      <td>https://www.liepin.com/company/3346592/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>10-20k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>电商广告优化师</td>\n",
       "      <td>上海-闵行区</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1930025801.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>8-10k·13薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>海外媒体广告优化师</td>\n",
       "      <td>郑州-郑东新区</td>\n",
       "      <td>麒麟合盛网络技术股份有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929901343.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8454950/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>12-20k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>北京</td>\n",
       "      <td>深圳英派科特广告传媒有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929697177.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8945895/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>6-10k·14薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>Facebook广告优化师</td>\n",
       "      <td>深圳-宝安区</td>\n",
       "      <td>深圳市傲雷电商科技股份有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929443955.shtml</td>\n",
       "      <td>https://www.liepin.com/company/10282081/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>6-10k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>青岛-市南区</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1929269859.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>6-10k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>郑州-北林路</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1928996111.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>15-20k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>杭州</td>\n",
       "      <td>杭州河象网络科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1928899813.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9888227/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>6-10k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>广告优化师/SEM/信息流</td>\n",
       "      <td>广州-天河区</td>\n",
       "      <td>广州智颜科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1928776813.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9313498/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>8-12k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td></td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1928662927.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>8-15k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>电商广告优化师</td>\n",
       "      <td></td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1928662917.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>上海-闵行区</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1928479597.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>大客户广告优化师</td>\n",
       "      <td>杭州-江干区</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1928291533.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>商业化广告优化师</td>\n",
       "      <td></td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1928034971.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8-16k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>南京-江东</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1928025147.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>大客户广告优化师</td>\n",
       "      <td>福州-台江区</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1927633629.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>大客户广告优化师</td>\n",
       "      <td>厦门-思明区</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1927633627.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>南阳</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927633617.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>郑州-金水区</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927633615.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>新乡</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927633613.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>济南-历下区</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927588249.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>临沂</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927588247.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>电商广告优化师</td>\n",
       "      <td>临沂</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927588245.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>电商广告优化师</td>\n",
       "      <td></td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927533169.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>泉州</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927301005.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>电商广告优化师</td>\n",
       "      <td>佛山-禅城区</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927222959.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>电商广告优化师</td>\n",
       "      <td>深圳-南山区</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927222945.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>商业化-广告优化师</td>\n",
       "      <td>东莞</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927195855.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>商业化-广告优化师</td>\n",
       "      <td>佛山-禅城区</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1927195601.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>电商广告优化师</td>\n",
       "      <td>温州-鹿城区</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1926886533.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>信息流广告优化师</td>\n",
       "      <td>杭州-余杭区</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1926846015.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>上海-龙华</td>\n",
       "      <td>今日头条</td>\n",
       "      <td>https://www.liepin.com/job/1926816489.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9630160/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>深圳-南山区</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1921812823.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>国内游戏广告优化师</td>\n",
       "      <td>深圳-南山区</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1921812821.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>海外游戏广告优化师</td>\n",
       "      <td>深圳-南山区</td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1921812817.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>20-30k·14薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>Facebook广告优化师</td>\n",
       "      <td></td>\n",
       "      <td>某知名互联网公司</td>\n",
       "      <td>https://www.liepin.com/a/21198795.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20-25k·13薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>资深sem广告优化师</td>\n",
       "      <td>上海,武汉,杭州</td>\n",
       "      <td>国内某知名大型化日化集团公司</td>\n",
       "      <td>https://www.liepin.com/a/21091859.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      edu    经验          薪水           时间                   职称      公司地点  \\\n",
       "0   本科及以上  3-5年  10-15k·12薪  2020年07月14日        广告优化师（头条，广点通）        上海   \n",
       "1    学历不限  经验不限  12-23k·12薪  2020年07月15日  海外广告优化师/广告优化师/APP推广    广州-天河区   \n",
       "2    统招本科  1-3年  11-12k·12薪  2020年07月18日        教育行业广告优化师（腾讯）    北京-双榆树   \n",
       "3    统招本科  1-3年  11-12k·12薪  2020年07月18日          广告数据优化师（腾讯）    深圳-南山区   \n",
       "4   本科及以上  经验不限  10-20k·12薪  2020年07月18日              电商广告优化师    上海-闵行区   \n",
       "5   本科及以上  3-5年   8-10k·13薪  2020年07月18日            海外媒体广告优化师   郑州-郑东新区   \n",
       "6   大专及以上  3-5年  12-20k·12薪  2020年07月18日                广告优化师        北京   \n",
       "7   大专及以上  1-3年   6-10k·14薪  2020年07月18日        Facebook广告优化师    深圳-宝安区   \n",
       "8   大专及以上  经验不限   6-10k·12薪  2020年07月18日             信息流广告优化师    青岛-市南区   \n",
       "9   本科及以上  经验不限   6-10k·12薪  2020年07月18日                广告优化师    郑州-北林路   \n",
       "10   学历不限  经验不限  15-20k·12薪  2020年07月18日                广告优化师        杭州   \n",
       "11  大专及以上  1-3年   6-10k·12薪  2020年07月18日        广告优化师/SEM/信息流    广州-天河区   \n",
       "12   统招本科  经验不限   8-12k·12薪  2020年07月18日             信息流广告优化师             \n",
       "13   统招本科  经验不限   8-15k·12薪  2020年07月18日              电商广告优化师             \n",
       "14  本科及以上  1-3年          面议  2020年07月18日             信息流广告优化师    上海-闵行区   \n",
       "15  本科及以上  3-5年          面议  2020年07月18日             大客户广告优化师    杭州-江干区   \n",
       "16  大专及以上  1-3年          面议  2020年07月18日             商业化广告优化师             \n",
       "17  本科及以上  1-3年   8-16k·12薪  2020年07月18日             信息流广告优化师     南京-江东   \n",
       "18  本科及以上  3-5年          面议  2020年07月18日             大客户广告优化师    福州-台江区   \n",
       "19  本科及以上  3-5年          面议  2020年07月18日             大客户广告优化师    厦门-思明区   \n",
       "20  本科及以上  3-5年          面议  2020年07月18日             信息流广告优化师        南阳   \n",
       "21  本科及以上  1-3年          面议  2020年07月18日             信息流广告优化师    郑州-金水区   \n",
       "22  本科及以上  3-5年          面议  2020年07月18日             信息流广告优化师        新乡   \n",
       "23  本科及以上  1-3年          面议  2020年07月18日             信息流广告优化师    济南-历下区   \n",
       "24  本科及以上  1-3年          面议  2020年07月18日             信息流广告优化师        临沂   \n",
       "25  本科及以上  1-3年          面议  2020年07月18日              电商广告优化师        临沂   \n",
       "26  本科及以上  1-3年          面议  2020年07月18日              电商广告优化师             \n",
       "27  本科及以上  1-3年          面议  2020年07月18日                广告优化师        泉州   \n",
       "28  本科及以上  1-3年          面议  2020年07月18日              电商广告优化师    佛山-禅城区   \n",
       "29  本科及以上  1-3年          面议  2020年07月18日              电商广告优化师    深圳-南山区   \n",
       "30  本科及以上  1-3年          面议  2020年07月18日            商业化-广告优化师        东莞   \n",
       "31  本科及以上  1-3年          面议  2020年07月18日            商业化-广告优化师    佛山-禅城区   \n",
       "32  本科及以上  1-3年          面议  2020年07月18日              电商广告优化师    温州-鹿城区   \n",
       "33  本科及以上  1-3年          面议  2020年07月18日             信息流广告优化师    杭州-余杭区   \n",
       "34  大专及以上  1-3年          面议  2020年07月18日                广告优化师     上海-龙华   \n",
       "35  本科及以上  1-3年          面议  2020年07月18日                广告优化师    深圳-南山区   \n",
       "36  本科及以上  1-3年          面议  2020年07月18日            国内游戏广告优化师    深圳-南山区   \n",
       "37  本科及以上  1-3年          面议  2020年07月18日            海外游戏广告优化师    深圳-南山区   \n",
       "38   学历不限  经验不限  20-30k·14薪  2020年07月18日        Facebook广告优化师             \n",
       "39   学历不限  3-5年  20-25k·13薪  2020年07月18日           资深sem广告优化师  上海,武汉,杭州   \n",
       "\n",
       "               公司名称                                           链结  \\\n",
       "0    上海嵩恒网络科技股份有限公司  https://www.liepin.com/job/1929908883.shtml   \n",
       "1      广州安悦网络科技有限公司  https://www.liepin.com/job/1929355323.shtml   \n",
       "2              人瑞集团  https://www.liepin.com/job/1930040621.shtml   \n",
       "3              人瑞集团  https://www.liepin.com/job/1930040533.shtml   \n",
       "4              字节跳动  https://www.liepin.com/job/1930025801.shtml   \n",
       "5    麒麟合盛网络技术股份有限公司  https://www.liepin.com/job/1929901343.shtml   \n",
       "6    深圳英派科特广告传媒有限公司  https://www.liepin.com/job/1929697177.shtml   \n",
       "7   深圳市傲雷电商科技股份有限公司  https://www.liepin.com/job/1929443955.shtml   \n",
       "8              今日头条  https://www.liepin.com/job/1929269859.shtml   \n",
       "9              今日头条  https://www.liepin.com/job/1928996111.shtml   \n",
       "10     杭州河象网络科技有限公司  https://www.liepin.com/job/1928899813.shtml   \n",
       "11       广州智颜科技有限公司  https://www.liepin.com/job/1928776813.shtml   \n",
       "12             今日头条  https://www.liepin.com/job/1928662927.shtml   \n",
       "13             今日头条  https://www.liepin.com/job/1928662917.shtml   \n",
       "14             字节跳动  https://www.liepin.com/job/1928479597.shtml   \n",
       "15             字节跳动  https://www.liepin.com/job/1928291533.shtml   \n",
       "16             字节跳动  https://www.liepin.com/job/1928034971.shtml   \n",
       "17             字节跳动  https://www.liepin.com/job/1928025147.shtml   \n",
       "18             字节跳动  https://www.liepin.com/job/1927633629.shtml   \n",
       "19             字节跳动  https://www.liepin.com/job/1927633627.shtml   \n",
       "20             今日头条  https://www.liepin.com/job/1927633617.shtml   \n",
       "21             今日头条  https://www.liepin.com/job/1927633615.shtml   \n",
       "22             今日头条  https://www.liepin.com/job/1927633613.shtml   \n",
       "23             今日头条  https://www.liepin.com/job/1927588249.shtml   \n",
       "24             今日头条  https://www.liepin.com/job/1927588247.shtml   \n",
       "25             今日头条  https://www.liepin.com/job/1927588245.shtml   \n",
       "26             今日头条  https://www.liepin.com/job/1927533169.shtml   \n",
       "27             今日头条  https://www.liepin.com/job/1927301005.shtml   \n",
       "28             今日头条  https://www.liepin.com/job/1927222959.shtml   \n",
       "29             今日头条  https://www.liepin.com/job/1927222945.shtml   \n",
       "30             今日头条  https://www.liepin.com/job/1927195855.shtml   \n",
       "31             今日头条  https://www.liepin.com/job/1927195601.shtml   \n",
       "32             今日头条  https://www.liepin.com/job/1926886533.shtml   \n",
       "33             今日头条  https://www.liepin.com/job/1926846015.shtml   \n",
       "34             今日头条  https://www.liepin.com/job/1926816489.shtml   \n",
       "35             字节跳动  https://www.liepin.com/job/1921812823.shtml   \n",
       "36             字节跳动  https://www.liepin.com/job/1921812821.shtml   \n",
       "37             字节跳动  https://www.liepin.com/job/1921812817.shtml   \n",
       "38         某知名互联网公司      https://www.liepin.com/a/21198795.shtml   \n",
       "39   国内某知名大型化日化集团公司      https://www.liepin.com/a/21091859.shtml   \n",
       "\n",
       "                                       公司URL  \n",
       "0    https://www.liepin.com/company/7877259/  \n",
       "1    https://www.liepin.com/company/9665352/  \n",
       "2    https://www.liepin.com/company/3346592/  \n",
       "3    https://www.liepin.com/company/3346592/  \n",
       "4    https://www.liepin.com/company/7863078/  \n",
       "5    https://www.liepin.com/company/8454950/  \n",
       "6    https://www.liepin.com/company/8945895/  \n",
       "7   https://www.liepin.com/company/10282081/  \n",
       "8    https://www.liepin.com/company/9630160/  \n",
       "9    https://www.liepin.com/company/9630160/  \n",
       "10   https://www.liepin.com/company/9888227/  \n",
       "11   https://www.liepin.com/company/9313498/  \n",
       "12   https://www.liepin.com/company/9630160/  \n",
       "13   https://www.liepin.com/company/9630160/  \n",
       "14   https://www.liepin.com/company/7863078/  \n",
       "15   https://www.liepin.com/company/7863078/  \n",
       "16   https://www.liepin.com/company/7863078/  \n",
       "17   https://www.liepin.com/company/7863078/  \n",
       "18   https://www.liepin.com/company/7863078/  \n",
       "19   https://www.liepin.com/company/7863078/  \n",
       "20   https://www.liepin.com/company/9630160/  \n",
       "21   https://www.liepin.com/company/9630160/  \n",
       "22   https://www.liepin.com/company/9630160/  \n",
       "23   https://www.liepin.com/company/9630160/  \n",
       "24   https://www.liepin.com/company/9630160/  \n",
       "25   https://www.liepin.com/company/9630160/  \n",
       "26   https://www.liepin.com/company/9630160/  \n",
       "27   https://www.liepin.com/company/9630160/  \n",
       "28   https://www.liepin.com/company/9630160/  \n",
       "29   https://www.liepin.com/company/9630160/  \n",
       "30   https://www.liepin.com/company/9630160/  \n",
       "31   https://www.liepin.com/company/9630160/  \n",
       "32   https://www.liepin.com/company/9630160/  \n",
       "33   https://www.liepin.com/company/9630160/  \n",
       "34   https://www.liepin.com/company/9630160/  \n",
       "35   https://www.liepin.com/company/7863078/  \n",
       "36   https://www.liepin.com/company/7863078/  \n",
       "37   https://www.liepin.com/company/7863078/  \n",
       "38                                            \n",
       "39                                            "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Evaluate every configured XPath inside the job items only, one list per column.\n",
    "\n",
    "数据字典 = {k: get_e_text_content(v) for k, v in dict_xpaths['text_content'].items()}\n",
    "数据字典.update({k: get_e_text(v) for k, v in dict_xpaths['text'].items()})\n",
    "数据字典.update({k: get_e_href(v) for k, v in dict_xpaths['href'].items()})\n",
    "\n",
    "# All columns must have the same length before the table is built; the lengths\n",
    "# were previously computed and then discarded without being checked.\n",
    "column_lengths = {k: len(v) for k, v in 数据字典.items()}\n",
    "assert len(set(column_lengths.values())) <= 1, column_lengths\n",
    "\n",
    "数据 = pd.DataFrame(数据字典)\n",
    "# Excel export is switched off for now:\n",
    "#数据.to_excel(\"20春_Web数据挖掘_week03_liepin.xlsx\", sheet_name=\"搜查结果\")\n",
    "数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pagination handling with XPath: fetch the first results page again as the starting point.\n",
    "\n",
    "url = \"https://www.liepin.com/zhaopin/?key=广告优化师\"\n",
    "session = HTMLSession()\n",
    "# A timeout keeps a stalled connection from blocking the kernel forever.\n",
    "r = session.get(url, timeout=10)\n",
    "# Stop on an HTTP error instead of running the XPath queries on an error page.\n",
    "r.raise_for_status()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=1'>, <Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=2'>, <Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=3'>, <Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=4'>, <Element 'a' 
href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=1'>, <Element 'a' class=('last',) href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=9' title='末页'>]\n",
      "{'2': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=1', '3': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=2', '4': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=3', '5': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=4', '下一页': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=1', '': 
'/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=9'}\n"
     ]
    }
   ],
   "source": [
    "# Collect the pagination links (a/@href) with XPath.\n",
    "\n",
    "# A plain '//div[@class=\"pagerbar\"]/a' also matches the disabled/current entries\n",
    "# whose href is javascript, so only links pointing at /zhaopin are kept.\n",
    "xpath_翻页a = '//div[@class=\"pagerbar\"]/a[starts-with(@href,\"/zhaopin\")]'\n",
    "\n",
    "翻页元素 = r.html.xpath(xpath_翻页a)\n",
    "print (翻页元素) # Element objects\n",
    "\n",
    "# href of every pagination link, in document order\n",
    "href_列表 = [元素.xpath('//@href')[0] for 元素 in 翻页元素]\n",
    "\n",
    "# visible label of every pagination link\n",
    "文字_列表 = [元素.text for 元素 in 翻页元素]\n",
    "\n",
    "# label -> href; links sharing a label keep only the last href\n",
    "href_字典 = dict(zip(文字_列表, href_列表))\n",
    "print (href_字典)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>scheme</th>\n",
       "      <th>netloc</th>\n",
       "      <th>path</th>\n",
       "      <th>params</th>\n",
       "      <th>query</th>\n",
       "      <th>fragment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  scheme netloc       path params  \\\n",
       "0                /zhaopin/          \n",
       "1                /zhaopin/          \n",
       "2                /zhaopin/          \n",
       "3                /zhaopin/          \n",
       "4                /zhaopin/          \n",
       "5                /zhaopin/          \n",
       "\n",
       "                                               query fragment  \n",
       "0  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "1  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "2  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "3  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "4  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "5  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scheme      1\n",
      "netloc      1\n",
      "path        1\n",
      "params      1\n",
      "query       5\n",
      "fragment    1\n",
      "dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>curPage</th>\n",
       "      <th>d_ckId</th>\n",
       "      <th>d_curPage</th>\n",
       "      <th>d_headId</th>\n",
       "      <th>d_pageSize</th>\n",
       "      <th>d_sfrom</th>\n",
       "      <th>key</th>\n",
       "      <th>pageSize</th>\n",
       "      <th>siTag</th>\n",
       "      <th>sortFlag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>0</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>40</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>40</td>\n",
       "      <td>8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>0</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>40</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>40</td>\n",
       "      <td>8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>0</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>40</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>40</td>\n",
       "      <td>8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>0</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>40</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>40</td>\n",
       "      <td>8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>0</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>40</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>40</td>\n",
       "      <td>8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>9</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>0</td>\n",
       "      <td>d749492eec21f921cb9113962c217219</td>\n",
       "      <td>40</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>广告优化师</td>\n",
       "      <td>40</td>\n",
       "      <td>8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  curPage                            d_ckId d_curPage  \\\n",
       "0       1  d749492eec21f921cb9113962c217219         0   \n",
       "1       2  d749492eec21f921cb9113962c217219         0   \n",
       "2       3  d749492eec21f921cb9113962c217219         0   \n",
       "3       4  d749492eec21f921cb9113962c217219         0   \n",
       "4       1  d749492eec21f921cb9113962c217219         0   \n",
       "5       9  d749492eec21f921cb9113962c217219         0   \n",
       "\n",
       "                           d_headId d_pageSize         d_sfrom    key  \\\n",
       "0  d749492eec21f921cb9113962c217219         40  search_unknown  广告优化师   \n",
       "1  d749492eec21f921cb9113962c217219         40  search_unknown  广告优化师   \n",
       "2  d749492eec21f921cb9113962c217219         40  search_unknown  广告优化师   \n",
       "3  d749492eec21f921cb9113962c217219         40  search_unknown  广告优化师   \n",
       "4  d749492eec21f921cb9113962c217219         40  search_unknown  广告优化师   \n",
       "5  d749492eec21f921cb9113962c217219         40  search_unknown  广告优化师   \n",
       "\n",
       "  pageSize                                          siTag     sortFlag  \n",
       "0       40  8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw  °radeFlag=0  \n",
       "1       40  8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw  °radeFlag=0  \n",
       "2       40  8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw  °radeFlag=0  \n",
       "3       40  8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw  °radeFlag=0  \n",
       "4       40  8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw  °radeFlag=0  \n",
       "5       40  8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw  °radeFlag=0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "curPage       5\n",
      "d_ckId        1\n",
      "d_curPage     1\n",
      "d_headId      1\n",
      "d_pageSize    1\n",
      "d_sfrom       1\n",
      "key           1\n",
      "pageSize      1\n",
      "siTag         1\n",
      "sortFlag      1\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Build the parameter template: find the key parameters and how they are structured.\n",
    "\n",
    "from urllib.parse import urlparse, parse_qs\n",
    "import pandas as pd\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "# Goal: turn href_列表 into one parameter dictionary per pagination link.\n",
    "\n",
    "# One row per link, one column per urlparse() component.\n",
    "df = pd.DataFrame([urlparse(链接) for 链接 in href_列表])\n",
    "\n",
    "# One row per link, one column per query parameter (first value of each only).\n",
    "df_qs = pd.DataFrame([\n",
    "    {名称: 取值[0] for 名称, 取值 in parse_qs(查询串).items()}\n",
    "    for 查询串 in df['query']\n",
    "])\n",
    "\n",
    "display(df)\n",
    "print(df.nunique())\n",
    "display(df_qs)\n",
    "print(df_qs.nunique())\n",
    "\n",
    "# Add an integer copy of curPage so that min()/max() compare numerically.\n",
    "df_qs = df_qs.assign(curPage_int=df_qs.curPage.astype(int))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'pageSize': ['40'], 'sortFlag': ['°radeFlag=0'], 'key': ['广告优化师'], 'siTag': ['8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['d749492eec21f921cb9113962c217219'], 'd_curPage': ['0'], 'd_pageSize': ['40'], 'd_headId': ['d749492eec21f921cb9113962c217219'], 'curPage': ['1']}\n",
      "{'2': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=1', '3': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=2', '4': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=3', '5': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=4', '下一页': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=1', '': 
'/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E5%B9%BF%E5%91%8A%E4%BC%98%E5%8C%96%E5%B8%88&siTag=8kCkdyv_J0vVPjlz-2K_rQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d749492eec21f921cb9113962c217219&d_curPage=0&d_pageSize=40&d_headId=d749492eec21f921cb9113962c217219&curPage=9'}\n"
     ]
    }
   ],
   "source": [
    "# Build the parameter template around curPage.\n",
    "\n",
    "def parse_url_qs_for_curPage (url):\n",
    "    \"\"\"Return the query string of `url` as a dict: parameter name -> list of values.\"\"\"\n",
    "    return parse_qs(urlparse(url).query)\n",
    "\n",
    "# Use the first pagination link as the template.\n",
    "参数模板 = parse_url_qs_for_curPage(href_列表[0])\n",
    "print (参数模板)\n",
    "\n",
    "print (href_字典)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "9\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'2': {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['广告优化师'],\n",
       "  'siTag': ['8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'curPage': ['2'],\n",
       "  'keyword': ['广告优化师']},\n",
       " '3': {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['广告优化师'],\n",
       "  'siTag': ['8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'curPage': ['3'],\n",
       "  'keyword': ['广告优化师']},\n",
       " '4': {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['广告优化师'],\n",
       "  'siTag': ['8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'curPage': ['4'],\n",
       "  'keyword': ['广告优化师']},\n",
       " '5': {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['广告优化师'],\n",
       "  'siTag': ['8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'curPage': ['5'],\n",
       "  'keyword': ['广告优化师']},\n",
       " '下一页': {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['广告优化师'],\n",
       "  'siTag': ['8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'curPage': ['下一页'],\n",
       "  'keyword': ['广告优化师']},\n",
       " '': {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['广告优化师'],\n",
       "  'siTag': ['8kCkdyv_J0vVPjlz-2K_rQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d749492eec21f921cb9113962c217219'],\n",
       "  'curPage': [''],\n",
       "  'keyword': ['广告优化师']}}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parameter generator: fill the template for a given keyword and curPage.\n",
    "def 参数模板生成(keyword, curPage):\n",
    "    \"\"\"Return a copy of 参数模板 customised for one search keyword and one results page.\n",
    "\n",
    "    keyword and curPage are passed as single-item lists, matching the\n",
    "    list-valued entries that parse_qs produced for the template.\n",
    "    \"\"\"\n",
    "    参数 = 参数模板.copy()  # shallow copy suffices: entries are replaced, never mutated\n",
    "    参数['curPage'] = curPage\n",
    "    # The site's pagination links carry the search term in `key`; without this\n",
    "    # override every request would keep the term stored in the template.\n",
    "    参数['key'] = keyword\n",
    "    参数['keyword'] = keyword  # kept so the generated dict still has this entry\n",
    "    return 参数\n",
    "\n",
    "# The links on the page cover only some of the pages, and their labels\n",
    "# ('2', '下一页', '') are not curPage values, so the page range is derived\n",
    "# from the curPage numbers found in those links instead.\n",
    "print (df_qs.curPage_int.min()) # smallest curPage among the links\n",
    "print (df_qs.curPage_int.max()) # largest curPage among the links\n",
    "\n",
    "# curPage is zero-based: 0 (the first page) ... largest linked curPage.\n",
    "参数_keyword_广告优化师_curPage = {\n",
    "    i: 参数模板生成(curPage=[i], keyword=['广告优化师'])\n",
    "    for i in range(0, df_qs.curPage_int.max() + 1)\n",
    "}\n",
    "# Former name of this mapping, kept so existing references keep working.\n",
    "参数_keyword_用户体验_curPage = 参数_keyword_广告优化师_curPage\n",
    "参数_keyword_广告优化师_curPage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Crawler: fetch a single results page and parse it.\n",
    "\n",
    "session = HTMLSession()\n",
    "\n",
    "def requests_liepin( url, params):\n",
    "    \"\"\"Fetch one liepin results page and return its job listings as a DataFrame.\n",
    "\n",
    "    url    -- address of the search page\n",
    "    params -- query-string parameters sent with the request\n",
    "\n",
    "    Returns one row per listing element and one column per key of the XPath\n",
    "    dictionary below; a field that is missing from a listing becomes \"\".\n",
    "    \"\"\"\n",
    "    # Send the caller's parameters (previously the global `payload` was read here).\n",
    "    r = session.get( url , params = params)\n",
    "\n",
    "    # Narrow down to the listing elements first; every field is searched inside them.\n",
    "    主要元素 = r.html.xpath( '//ul[@class=\"sojob-list\"]/li')\n",
    "\n",
    "    # XPath dictionary: key = column name, value = XPath, grouped by extraction mode.\n",
    "    dict_xpaths={ \n",
    "        'text': {\n",
    "            'edu':      '//div[contains(@class,\"job-info\")]/p/span[@class=\"edu\"]',\n",
    "            '经验':      '//div[contains(@class,\"job-info\")]/p/span[@class=\"edu\"]/following-sibling::span',\n",
    "            '薪水':    '//div[contains(@class,\"job-info\")]/p/span[@class=\"text-warning\"]', \n",
    "            '时间':    '//div[contains(@class,\"job-info\")]/p/time/@title', \n",
    "            '职称':    '//div[contains(@class,\"job-info\")]/h3/a', \n",
    "            '公司地点': '//div[contains(@class,\"job-info\")]/p/a',\n",
    "            '公司名称': '//div[contains(@class,\"sojob-item-main\")]//p[@class=\"company-name\"]/a', \n",
    "        },\n",
    "        'text_content': {\n",
    "        },\n",
    "        'href': {\n",
    "            '链结':    '//div[contains(@class,\"job-info\")]/h3/a', \n",
    "            '公司URL': '//div[contains(@class,\"sojob-item-main\")]//p[@class=\"company-name\"]/a', \n",
    "        }\n",
    "    }\n",
    "\n",
    "    def get_e_text_content(_xpath_):\n",
    "        \"\"\"text_content() of the first match in each listing (\"\" when nothing matches).\"\"\"\n",
    "        暂存结果 = []\n",
    "        for e in 主要元素:\n",
    "            匹配 = e.xpath(_xpath_)\n",
    "            暂存结果.append(匹配[0].lxml.text_content() if 匹配 else \"\")\n",
    "        return(暂存结果)\n",
    "\n",
    "    def get_e_text(_xpath_):\n",
    "        \"\"\"Concatenated, stripped text of all matches in each listing.\"\"\"\n",
    "        # A match is either a plain string (e.g. for the @title XPath) or an\n",
    "        # element whose .text is used.\n",
    "        暂存结果 = [\n",
    "            \"\".join([x.strip() if type(x) is str else x.text.strip() for x in e.xpath(_xpath_)])\n",
    "            for e in 主要元素\n",
    "        ]\n",
    "        return(暂存结果)\n",
    "\n",
    "    def get_e_href(_xpath_):\n",
    "        \"\"\"One absolute link of the first match in each listing (\"\" when there is none).\"\"\"\n",
    "        暂存结果 = []\n",
    "        for e in 主要元素:\n",
    "            首个 = e.xpath(_xpath_, first=True)  # None when the listing has no match\n",
    "            链接 = list(首个.absolute_links) if 首个 is not None else []\n",
    "            暂存结果.append(链接[0] if 链接 else \"\")\n",
    "        return(暂存结果)\n",
    "\n",
    "    # Extract every field, searching only inside the listing elements.\n",
    "    数据字典 = {k:get_e_text_content(v) for k,v in dict_xpaths['text_content'].items()}\n",
    "    数据字典.update({k:get_e_text(v) for k,v in dict_xpaths['text'].items()})\n",
    "    数据字典.update({k:get_e_href(v) for k,v in dict_xpaths['href'].items()})\n",
    "\n",
    "    数据 = pd.DataFrame(数据字典)\n",
    "    #数据.to_excel(\"20春_Web数据挖掘_week03_liepin.xlsx\", sheet_name=\"搜查结果\")\n",
    "    return (数据)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 37.8 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "url = \"https://www.liepin.com/zhaopin/\"\n",
    "\n",
    "# Crawl every prepared page of the keyword, collecting one DataFrame per page.\n",
    "list_df = []\n",
    "for 页码, payload in 参数_keyword_广告优化师_curPage.items():\n",
    "    df = requests_liepin(url, params=payload).assign(curPage=页码)  # tag rows with their page key\n",
    "    list_df.append(df)\n",
    "    time.sleep(3 + 4 * random())  # slow down: wait 3-7 s, about 5 s on average\n",
    "\n",
    "df_all = pd.concat(list_df).reset_index()\n",
    "df_all.index.name = '序'\n",
    "\n",
    "# Export\n",
    "df_all.to_excel(\"广告优化师_liepin_翻页.xlsx\", sheet_name=\"广告优化师\")\n",
    "\n",
    "# Estimated duration: 5 s * 10 = 50 s\n",
    "# Estimated rows: 40 * 10 = 400"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "广告优化师 10\n",
      "电商运营 10\n",
      "Wall time: 2min 1s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Multiple pages + multiple keywords\n",
    "\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "url = \"https://www.liepin.com/zhaopin/\"\n",
    "xpath_翻页a = '//div[@class=\"pagerbar\"]/a[starts-with(@href,\"/zhaopin\")]'\n",
    "\n",
    "keywords = ['广告优化师','电商运营']\n",
    "list_df = list()\n",
    "\n",
    "for key in keywords:\n",
    "    ## Probe the first page to learn how many pages this keyword has.\n",
    "    payload = 参数模板生成(keyword=[key], curPage=['0'])\n",
    "    # Fetch the probe page directly: the pagination links must come from the\n",
    "    # response for THIS keyword, not from a response left over in `r` by an\n",
    "    # earlier cell (requests_liepin only returns the parsed DataFrame).\n",
    "    首页响应 = session.get(url, params=payload)\n",
    "    href_列表 = [x.xpath('//@href')[0] for x in 首页响应.html.xpath(xpath_翻页a)]\n",
    "    # curPage is zero-based, so the page count is the largest linked curPage + 1;\n",
    "    # a result page without pagination links counts as a single page.\n",
    "    页码_列表 = [\n",
    "        int(查询['curPage'][0])\n",
    "        for 查询 in (parse_qs(urlparse(x).query) for x in href_列表)\n",
    "        if 'curPage' in 查询\n",
    "    ]\n",
    "    长度 = max(页码_列表, default=0) + 1\n",
    "    参数_keyword_X_curPage = {\n",
    "        i: 参数模板生成(curPage=[i], keyword=[key])\n",
    "        for i in range(0, 长度)\n",
    "    }\n",
    "    #print (参数_keyword_X_curPage)\n",
    "    print (key,长度)\n",
    "    \n",
    "    for k,v in 参数_keyword_X_curPage.items():\n",
    "        payload = v\n",
    "        df = requests_liepin( url, params = payload)\n",
    "        time.sleep(3+4*random())  # slow down: wait 3-7 s, about 5 s on average\n",
    "        df = df.assign (keyword = key)  # tag rows with their keyword\n",
    "        df = df.assign (curPage = k)  # tag rows with their page\n",
    "        list_df.append(df)\n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index()\n",
    "df_all.index.name = '序'\n",
    "\n",
    "df_all.to_excel(\"广告优化师_电商运营_liepin_翻页.xlsx\",\n",
    "                sheet_name=\"_\".join(keywords))\n",
    "# Estimated duration: 2 * 5 s * 10 = 100 s\n",
    "# Estimated rows: 2 * 40 * 10 = 800"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 0 ns\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import time\n",
    "from random import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "广告优化师 10\n",
      "电商运营 10\n",
      "直播带货 10\n",
      "短视频 10\n"
     ]
    }
   ],
   "source": [
    "# When crawling many pages and keywords, back up each page's df as a\n",
    "# checkpoint in case the run is interrupted.\n",
    "\n",
    "# Multiple pages + multiple keywords\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "url = \"https://www.liepin.com/zhaopin/\"\n",
    "xpath_翻页a = '//div[@class=\"pagerbar\"]/a[starts-with(@href,\"/zhaopin\")]'\n",
    "\n",
    "keywords = ['广告优化师','电商运营','直播带货','短视频']\n",
    "list_df = list()\n",
    "\n",
    "for key in keywords:\n",
    "    ## Probe the first page to learn how many pages this keyword has.\n",
    "    payload = 参数模板生成(keyword=[key], curPage=['0'])\n",
    "    # Fetch the probe page directly: the pagination links must come from the\n",
    "    # response for THIS keyword, not from a response left over in `r` by an\n",
    "    # earlier cell (requests_liepin only returns the parsed DataFrame).\n",
    "    首页响应 = session.get(url, params=payload)\n",
    "    href_列表 = [x.xpath('//@href')[0] for x in 首页响应.html.xpath(xpath_翻页a)]\n",
    "    # curPage is zero-based, so the page count is the largest linked curPage + 1;\n",
    "    # a result page without pagination links counts as a single page.\n",
    "    页码_列表 = [\n",
    "        int(查询['curPage'][0])\n",
    "        for 查询 in (parse_qs(urlparse(x).query) for x in href_列表)\n",
    "        if 'curPage' in 查询\n",
    "    ]\n",
    "    长度 = max(页码_列表, default=0) + 1\n",
    "    参数_keyword_X_curPage = {\n",
    "        i: 参数模板生成(curPage=[i], keyword=[key])\n",
    "        for i in range(0, 长度)\n",
    "    }\n",
    "    #print (参数_keyword_X_curPage)\n",
    "    print (key,长度)\n",
    "    \n",
    "    for k,v in 参数_keyword_X_curPage.items():\n",
    "        payload = v\n",
    "        df = requests_liepin( url, params = payload)\n",
    "        time.sleep(3+4*random())  # slow down: wait 3-7 s, about 5 s on average\n",
    "        ## Checkpoint: save this page before the tagging columns are added.\n",
    "        df.to_csv(\"20春_Web数据挖掘_week04_liepin_{key}_{k}.tsv\".format(key=key, k=k),\n",
    "                  sep=\"\\t\", encoding=\"utf8\")\n",
    "        \n",
    "        df = df.assign (keyword = key)  # tag rows with their keyword\n",
    "        df = df.assign (curPage = k)  # tag rows with their page\n",
    "        list_df.append(df)\n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index()\n",
    "df_all.index.name = '序'\n",
    "\n",
    "df_all.to_excel(\"广告优化师_电商运营_直播带货_liepin_翻页.xlsx\",\n",
    "                sheet_name=\"_\".join(keywords))\n",
    "# Estimated duration: 4 * 5 s * 10 = 200 s\n",
    "# Estimated rows: 4 * 40 * 10 = 1600"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "广告优化师 10\n",
      "广州 10\n",
      "深圳 10\n",
      "佛山 10\n",
      "珠海 10\n"
     ]
    }
   ],
   "source": [
    "# Crawl every result page for a second keyword list. Each page is saved to its own\n",
    "# TSV right after it is fetched, so an interrupted run keeps the pages already done.\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "url = \"https://www.liepin.com/zhaopin/\"\n",
    "xpath_翻页a = '//div[@class=\"pagerbar\"]/a[starts-with(@href,\"/zhaopin\")]'\n",
    "\n",
    "# NOTE(review): the city names are sent as search keywords, not as a location\n",
    "# filter - confirm this is what is intended.\n",
    "keywords = ['广告优化师','广州','深圳','佛山','珠海']\n",
    "list_df = list()\n",
    "\n",
    "for key in keywords:\n",
    "    # Probe page 0 of this keyword to learn how many result pages it has.\n",
    "    # The page is fetched here on purpose: reading the pager from an `r` left in\n",
    "    # the kernel by an earlier cell would report another search's page count.\n",
    "    # `session` is the HTMLSession created at the top of the notebook.\n",
    "    # NOTE(review): assumes requests_liepin sends `params` the same way - confirm.\n",
    "    payload = 参数模板生成(keyword=[key], curPage=['0'])\n",
    "    r = session.get(url, params=payload)\n",
    "    href_列表 = [x.xpath('//@href')[0] for x in r.html.xpath(xpath_翻页a)]\n",
    "\n",
    "    # Each pager link carries its page number in the `curPage` query parameter.\n",
    "    page_numbers = [\n",
    "        int(qs['curPage'][0])\n",
    "        for qs in (parse_qs(urlparse(href).query) for href in href_列表)\n",
    "        if 'curPage' in qs\n",
    "    ]\n",
    "    # No pager links (e.g. a single page of results): fetch page 0 only\n",
    "    # instead of failing on an empty table.\n",
    "    长度 = max(page_numbers) + 1 if page_numbers else 1\n",
    "    参数_keyword_X_curPage = {\n",
    "        i: 参数模板生成(curPage=[i], keyword=[key])\n",
    "        for i in range(0, 长度)\n",
    "    }\n",
    "    print (key,长度)\n",
    "\n",
    "    for k, v in 参数_keyword_X_curPage.items():\n",
    "        payload = v\n",
    "        df = requests_liepin(url, params=payload)\n",
    "        # Back up the page before pausing so a break during the sleep loses nothing.\n",
    "        df.to_csv(\"20春_Web数据挖掘_week04_liepin_{key}_{k}.tsv\"\n",
    "                  .format(key=key, k=k), sep=\"\\t\", encoding=\"utf8\")\n",
    "\n",
    "        # Tag the rows with the keyword and page they came from.\n",
    "        df = df.assign(keyword=key, curPage=k)\n",
    "        list_df.append(df)\n",
    "        time.sleep(3+4*random())  # throttle: 3-7 s per request, about 5 s on average\n",
    "\n",
    "df_all = pd.concat(list_df).reset_index()\n",
    "df_all.index.name = '序'\n",
    "\n",
    "df_all.to_excel(\"广告优化师_广州_深圳_佛山_珠海_liepin_翻页.xlsx\",\n",
    "                sheet_name=\"_\".join(keywords))\n",
    "# Estimated run time: 5 keywords * ~5 s * 10 pages = ~250 s\n",
    "# Estimated rows: 5 keywords * 40 per page * 10 pages = 2000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
